In [1]:
import pandas as pd
import numpy as np
import os
import datetime
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn import tree
from sklearn import ensemble

import pytz
import itertools
import visualize
import utils
import pydotplus
import xgboost as xgb

from sklearn import metrics
from sklearn import model_selection

import pvlib
import cs_detection

import visualize_plotly as visualize
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import cufflinks as cf
cf.go_offline()
init_notebook_mode(connected=True)

from IPython.display import Image

%load_ext autoreload
%autoreload 2

np.set_printoptions(precision=4)
%matplotlib notebook

Ground predictions

PVLib Clearsky

Only making ground predictions using PVLib clearsky model and statistical model. NSRDB model won't be available to ground measurements.

In [2]:
nsrdb = cs_detection.ClearskyDetection.read_pickle('abq_nsrdb_1.pkl.gz')
nsrdb.df.index = nsrdb.df.index.tz_convert('MST')
nsrdb.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')
In [3]:
len(nsrdb.df)
Out[3]:
315552

Train/test on NSRDB data to find optimal parameters

Default classifier

In [4]:
train = cs_detection.ClearskyDetection(nsrdb.df, scale_col=None)
train.trim_dates(None, '01-01-2015')
test = cs_detection.ClearskyDetection(nsrdb.df, scale_col=None)
test.trim_dates('01-01-2015', None)
In [5]:
train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')
In [6]:
clf = ensemble.RandomForestClassifier(random_state=42)
In [7]:
utils.calc_all_window_metrics(train.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)
In [8]:
feature_cols = [
    'tfn',
    'abs_ideal_ratio_diff grad',
    'abs_ideal_ratio_diff grad mean', 
    'abs_ideal_ratio_diff grad std',
    'abs_ideal_ratio_diff grad second',
    'abs_ideal_ratio_diff grad second mean',
    'abs_ideal_ratio_diff grad second std',
    'GHI Clearsky GHI pvlib line length ratio',
    'GHI Clearsky GHI pvlib ratio', 
    'GHI Clearsky GHI pvlib ratio mean',
    'GHI Clearsky GHI pvlib ratio std',
    'GHI Clearsky GHI pvlib diff',
    'GHI Clearsky GHI pvlib diff mean', 
    'GHI Clearsky GHI pvlib diff std'
]

target_cols = ['sky_status']
In [9]:
vis = visualize.Visualizer()
vis.plot_corr_matrix(train.df[feature_cols].corr(), feature_cols)
/Users/benellis/miniconda3/lib/python3.5/site-packages/seaborn/palettes.py:727: DeprecationWarning:

object of type <class 'float'> cannot be safely interpreted as an integer.

/Users/benellis/miniconda3/lib/python3.5/site-packages/seaborn/palettes.py:727: DeprecationWarning:

object of type <class 'float'> cannot be safely interpreted as an integer.

In [10]:
train = cs_detection.ClearskyDetection(nsrdb.df, scale_col=None)
train.trim_dates(None, '01-01-2015')
test = cs_detection.ClearskyDetection(nsrdb.df, scale_col=None)
test.trim_dates('01-01-2015', None)
In [11]:
utils.calc_all_window_metrics(train.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)
clf.fit(train.df[feature_cols].values, train.df[target_cols].values.flatten())
Out[11]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=42,
            verbose=0, warm_start=False)
In [12]:
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 3, multiproc=True, by_day=True).astype(bool)
/Users/benellis/duramat/clearsky_detection/utils.py:343: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:343: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:343: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:343: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:336: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

In [13]:
metrics.accuracy_score(test.df['sky_status'], pred)
Out[13]:
0.94474885844748857
In [14]:
vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[(test.df['sky_status'] == 0) & (pred)]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[(test.df['sky_status'] == 1) & (~pred)]['GHI'], 'NSRDB clear only')
vis.add_circle_ser(test.df[(test.df['sky_status'] == 1) & (pred)]['GHI'], 'ML+NSRDB clear only')
vis.show()
In [15]:
cm = metrics.confusion_matrix(test.df['sky_status'].values, pred)
vis = visualize.Visualizer()
vis.plot_confusion_matrix(cm, labels=['cloudy', 'clear'])
In [16]:
bar = go.Bar(x=feature_cols, y=clf.feature_importances_)
iplot([bar])

Gridsearch

In [17]:
import warnings
with warnings.catch_warnings(): warnings.simplefilter('ignore') params={} params['max_depth'] = [4, 8, 12, 16] params['n_estimators'] = [64] params['class_weight'] = [None, 'balanced'] params['min_samples_leaf'] = [1, 2, 3] results = [] for depth, nest, cw, min_samples in itertools.product(params['max_depth'], params['n_estimators'], params['class_weight'], params['min_samples_leaf']): print('Params:') print('depth: {}, n_estimators: {}, class_weight: {}, min_samples_leaf: {}'.format(depth, nest, cw, min_samples)) train2 = cs_detection.ClearskyDetection(train.df) train2.trim_dates('01-01-1999', '01-01-2014') utils.calc_all_window_metrics(train2.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True) test2 = cs_detection.ClearskyDetection(train.df) test2.trim_dates('01-01-2014', '01-01-2015') clf = ensemble.RandomForestClassifier(max_depth=depth, n_estimators=nest, class_weight=cw, min_samples_leaf=min_samples, n_jobs=-1, random_state=42) clf.fit(train2.df[train2.df['GHI'] > 0][feature_cols].values, train2.df[train2.df['GHI'] > 0][target_cols].values.flatten()) print('\t Scores:') test_pred = test2.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 3, multiproc=True, by_day=True) accuracy_score = metrics.accuracy_score(test2.df['sky_status'], test_pred) print('\t\t accuracy: {}'.format(accuracy_score)) f1_score = metrics.f1_score(test2.df['sky_status'], test_pred) print('\t\t f1:{}'.format(f1_score)) recall_score = metrics.recall_score(test2.df['sky_status'], test_pred) print('\t\t recall:{}'.format(recall_score)) precision_score = metrics.precision_score(test2.df['sky_status'], test_pred) print('\t\t precision:{}'.format(precision_score)) results.append({'max_depth': depth, 'n_estimators': nest, 'class_weight': cw, 'min_samples_leaf': min_samples, 'accuracy': accuracy_score, 'f1': f1_score, 'recall': recall_score, 'precision': precision_score})runs_df = pd.DataFrame(results)runs_df.to_csv('8_abq_directional_features.csv')
In [18]:
runs_df = pd.read_csv('8_abq_directional_features.csv')
In [19]:
runs_df[['accuracy', 'f1', 'recall', 'precision']].iplot(kind='box')
In [20]:
runs_df
Out[20]:
Unnamed: 0 accuracy class_weight f1 max_depth min_samples_leaf n_estimators precision recall
0 0 0.947660 NaN 0.906743 4 1 64 0.886988 0.927398
1 1 0.947660 NaN 0.906743 4 2 64 0.886988 0.927398
2 2 0.947660 NaN 0.906743 4 3 64 0.886988 0.927398
3 3 0.946804 balanced 0.904801 4 1 64 0.888822 0.921365
4 4 0.946804 balanced 0.904801 4 2 64 0.888822 0.921365
5 5 0.946804 balanced 0.904801 4 3 64 0.888822 0.921365
6 6 0.950799 NaN 0.913262 8 1 64 0.884428 0.944040
7 7 0.950628 NaN 0.912969 8 2 64 0.884061 0.943832
8 8 0.950799 NaN 0.913105 8 3 64 0.885781 0.942168
9 9 0.950571 balanced 0.912614 8 1 64 0.886145 0.940711
10 10 0.950285 balanced 0.912082 8 2 64 0.885882 0.939879
11 11 0.950856 balanced 0.913004 8 3 64 0.887623 0.939879
12 12 0.950000 NaN 0.911587 12 1 64 0.885317 0.939463
13 13 0.949943 NaN 0.911495 12 2 64 0.885143 0.939463
14 14 0.950742 NaN 0.912925 12 3 64 0.886364 0.941128
15 15 0.949372 balanced 0.910177 12 1 64 0.886740 0.934887
16 16 0.948801 balanced 0.909072 12 2 64 0.886516 0.932806
17 17 0.949429 balanced 0.910269 12 3 64 0.886915 0.934887
18 18 0.949144 NaN 0.909790 16 1 64 0.886193 0.934679
19 19 0.948402 NaN 0.908668 16 2 64 0.883324 0.935511
20 20 0.949315 NaN 0.910321 16 3 64 0.884593 0.937591
21 21 0.948459 balanced 0.908705 16 1 64 0.883950 0.934887
22 22 0.948459 balanced 0.908575 16 2 64 0.885010 0.933430
23 23 0.949315 balanced 0.910067 16 3 64 0.886718 0.934679

Best recall model

In [21]:
train = cs_detection.ClearskyDetection(nsrdb.df)
train.trim_dates(None, '01-01-2015')
test = cs_detection.ClearskyDetection(nsrdb.df)
test.trim_dates('01-01-2015', None)
train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')
In [22]:
utils.calc_all_window_metrics(train.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)
In [23]:
best_recall = runs_df.iloc[runs_df['recall'].idxmax()]
In [24]:
params_recall = best_recall[['max_depth', 'n_estimators', 'min_samples_leaf']].to_dict()
In [25]:
params_recall
Out[25]:
{'max_depth': 8, 'min_samples_leaf': 1, 'n_estimators': 64}
clf = ensemble.RandomForestClassifier(**params_recall, n_jobs=-1) clf.fit(train.df[feature_cols].values, train.df[target_cols].values.flatten())test = cs_detection.ClearskyDetection(nsrdb.df) test.trim_dates('01-01-2015', None)pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 3, multiproc=True, by_day=True).astype(bool)vis = visualize.Visualizer() vis.add_line_ser(test.df['GHI'], 'GHI') vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs') vis.add_circle_ser(test.df[(test.df['sky_status'] == 0) & (pred)]['GHI'], 'ML clear only') vis.add_circle_ser(test.df[(test.df['sky_status'] == 1) & (~pred)]['GHI'], 'NSRDB clear only') vis.add_circle_ser(test.df[(test.df['sky_status'] == 1) & (pred)]['GHI'], 'ML+NSRDB clear only') vis.show()cm = metrics.confusion_matrix(test.df['sky_status'].values, pred) vis = visualize.Visualizer() vis.plot_confusion_matrix(cm, labels=['cloudy', 'clear'])metrics.recall_score(test.df['sky_status'].values, pred)

Best accuracy model

In [26]:
train = cs_detection.ClearskyDetection(nsrdb.df)
train.trim_dates(None, '01-01-2015')
test = cs_detection.ClearskyDetection(nsrdb.df)
test.trim_dates('01-01-2015', None)
train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')
In [27]:
utils.calc_all_window_metrics(train.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)
In [28]:
best_accuracy = runs_df.iloc[runs_df['accuracy'].idxmax()]
In [29]:
print(best_accuracy.equals(best_recall))
False
In [30]:
params_accuracy = best_accuracy[['max_depth', 'n_estimators', 'class_weight', 'min_samples_leaf']].to_dict()
In [31]:
params_accuracy
Out[31]:
{'class_weight': 'balanced',
 'max_depth': 8,
 'min_samples_leaf': 3,
 'n_estimators': 64}
clf = ensemble.RandomForestClassifier(**params_accuracy, n_jobs=-1) clf.fit(train.df[feature_cols].values, train.df[target_cols].values.flatten())test = cs_detection.ClearskyDetection(nsrdb.df) test.trim_dates('01-01-2015', None)pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 3, multiproc=True, by_day=True).astype(bool)vis = visualize.Visualizer() vis.add_line_ser(test.df['GHI'], 'GHI') vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs') vis.add_circle_ser(test.df[(test.df['sky_status'] == 0) & (pred)]['GHI'], 'ML clear only') vis.add_circle_ser(test.df[(test.df['sky_status'] == 1) & (~pred)]['GHI'], 'NSRDB clear only') vis.add_circle_ser(test.df[(test.df['sky_status'] == 1) & (pred)]['GHI'], 'ML+NSRDB clear only') vis.show()cm = metrics.confusion_matrix(test.df['sky_status'].values, pred) vis = visualize.Visualizer() vis.plot_confusion_matrix(cm, labels=['cloudy', 'clear'])

Best precision model

In [32]:
train = cs_detection.ClearskyDetection(nsrdb.df)
train.trim_dates(None, '01-01-2015')
test = cs_detection.ClearskyDetection(nsrdb.df)
test.trim_dates('01-01-2015', None)
train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')
In [33]:
utils.calc_all_window_metrics(train.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)
In [34]:
best_precision = runs_df.iloc[runs_df['precision'].idxmax()]
In [35]:
print(best_precision.equals(best_recall))
print(best_precision.equals(best_accuracy))
False
False
In [36]:
params_precision = best_precision[['max_depth', 'n_estimators', 'class_weight', 'min_samples_leaf']].to_dict()
In [37]:
params_precision
Out[37]:
{'class_weight': 'balanced',
 'max_depth': 4,
 'min_samples_leaf': 1,
 'n_estimators': 64}
clf = ensemble.RandomForestClassifier(**params_precision, n_jobs=-1) clf.fit(train.df[feature_cols].values, train.df[target_cols].values.flatten())test = cs_detection.ClearskyDetection(nsrdb.df) test.trim_dates('01-01-2015', None)pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 3, multiproc=True, by_day=True).astype(bool)vis = visualize.Visualizer() vis.add_line_ser(test.df['GHI'], 'GHI') vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs') vis.add_circle_ser(test.df[(test.df['sky_status'] == 0) & (pred)]['GHI'], 'ML clear only') vis.add_circle_ser(test.df[(test.df['sky_status'] == 1) & (~pred)]['GHI'], 'NSRDB clear only') vis.add_circle_ser(test.df[(test.df['sky_status'] == 1) & (pred)]['GHI'], 'ML+NSRDB clear only') vis.show()cm = metrics.confusion_matrix(test.df['sky_status'].values, pred) vis = visualize.Visualizer() vis.plot_confusion_matrix(cm, labels=['cloudy', 'clear'])

Best f1 model

In [38]:
train = cs_detection.ClearskyDetection(nsrdb.df)
train.trim_dates(None, '01-01-2015')
test = cs_detection.ClearskyDetection(nsrdb.df)
test.trim_dates('01-01-2015', None)
train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')
In [39]:
utils.calc_all_window_metrics(train.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)
In [40]:
best_f1 = runs_df.iloc[runs_df['f1'].idxmax()]
In [41]:
print(best_f1.equals(best_recall))
print(best_f1.equals(best_accuracy))
print(best_f1.equals(best_precision))
True
False
False
In [42]:
best_f1
Out[42]:
Unnamed: 0                 6
accuracy            0.950799
class_weight             NaN
f1                  0.913262
max_depth                  8
min_samples_leaf           1
n_estimators              64
precision           0.884428
recall               0.94404
Name: 6, dtype: object
In [43]:
best_f1 = best_f1[['max_depth', 'min_samples_leaf', 'max_depth']].to_dict()

Same model as best recall - scroll up.

Train on all NSRDB data, test various freq of ground data

In [44]:
train = cs_detection.ClearskyDetection(nsrdb.df)
train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')
utils.calc_all_window_metrics(train.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)
clf = ensemble.RandomForestClassifier(**best_f1, n_estimators=100)
clf.fit(train.df[feature_cols].values, train.df[target_cols].values.flatten())
Out[44]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=8, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
In [45]:
bar = go.Bar(x=feature_cols, y=clf.feature_importances_)
iplot([bar])

30 min freq ground data

In [46]:
ground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz')
ground.df.index = ground.df.index.tz_convert('MST')
test = cs_detection.ClearskyDetection(ground.df)
In [47]:
test.trim_dates('10-01-2015', '11-01-2015')
In [48]:
test.df = test.df[test.df.index.minute % 30 == 0]
In [49]:
test.df.keys()
Out[49]:
Index(['GHI', 'Clearsky GHI pvlib', 'Clearsky GHI stat', 'sky_status pvlib',
       'Clearsky GHI stat smooth', 'ghi_status', 'scale'],
      dtype='object')
In [50]:
test.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')
In [51]:
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 3, multiproc=True, by_day=True).astype(bool)
In [52]:
train2 = cs_detection.ClearskyDetection(nsrdb.df)
train2.intersection(test.df.index)
In [53]:
nsrdb_clear = train2.df['sky_status'].values
ml_clear = pred
vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[ml_clear & ~nsrdb_clear]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[~ml_clear & nsrdb_clear]['GHI'], 'NSRDB clear only')
vis.add_circle_ser(test.df[ml_clear & nsrdb_clear]['GHI'], 'Both clear')
vis.show()
In [54]:
probas = clf.predict_proba(test.df[feature_cols].values)
test.df['probas'] = 0
test.df['probas'] = probas[:, 1]
visualize.plot_ts_slider_highligther(test.df, prob='probas')
## 15 min freq ground dataground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz') ground.df.index = ground.df.index.tz_convert('MST') test = cs_detection.ClearskyDetection(ground.df)test.trim_dates('10-01-2015', '10-17-2015')test.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')test.df = test.df[test.df.index.minute % 15 == 0] # test.df = test.df.resample('15T').apply(lambda x: x[len(x) // 2])pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 5, multiproc=True, by_day=True).astype(bool)train2 = cs_detection.ClearskyDetection(train.df) train2.trim_dates('10-01-2015', '10-17-2015') train2.df = train2.df.reindex(pd.date_range(start=train2.df.index[0], end=train2.df.index[-1], freq='15min')) train2.df['sky_status'] = train2.df['sky_status'].fillna(False)nsrdb_clear = train2.df['sky_status'] ml_clear = test.df['sky_status iter'] vis = visualize.Visualizer() vis.add_line_ser(test.df['GHI'], 'GHI') vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs') vis.add_circle_ser(test.df[ml_clear & ~nsrdb_clear]['GHI'], 'ML clear only') vis.add_circle_ser(test.df[~ml_clear & nsrdb_clear]['GHI'], 'NSRDB clear only') vis.add_circle_ser(test.df[ml_clear & nsrdb_clear]['GHI'], 'Both clear') vis.show()probas = clf.predict_proba(test.df[feature_cols].values) test.df['probas'] = 0 test.df['probas'] = probas[:, 1] visualize.plot_ts_slider_highligther(test.df, prob='probas')## 10 min freq ground dataground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz') ground.df.index = ground.df.index.tz_convert('MST') test = cs_detection.ClearskyDetection(ground.df)test.trim_dates('10-01-2015', '10-08-2015')test.time_from_solar_noon('Clearsky GHI pvlib', 'tfn') test.scale_by_irrad('Clearsky GHI pvlib')test.df = test.df[test.df.index.minute % 10 == 0]pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 7, multiproc=True, by_day=True).astype(bool)train2 = cs_detection.ClearskyDetection(train.df) train2.trim_dates('10-01-2015', '10-08-2015') train2.df = train2.df.reindex(pd.date_range(start=train2.df.index[0], end=train2.df.index[-1], freq='10min')) train2.df['sky_status'] = train2.df['sky_status'].fillna(False)nsrdb_clear = train2.df['sky_status'] ml_clear = test.df['sky_status iter'] vis = visualize.Visualizer() vis.add_line_ser(test.df['GHI'], 'GHI') vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs') vis.add_circle_ser(test.df[ml_clear & ~nsrdb_clear]['GHI'], 'ML clear only') vis.add_circle_ser(test.df[~ml_clear & nsrdb_clear]['GHI'], 'NSRDB clear only') vis.add_circle_ser(test.df[ml_clear & nsrdb_clear]['GHI'], 'Both clear') vis.show()probas = clf.predict_proba(test.df[feature_cols].values) test.df['probas'] = 0 test.df['probas'] = probas[:, 1] visualize.plot_ts_slider_highligther(test.df, prob='probas')## 5 min freq ground dataground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz') ground.df.index = ground.df.index.tz_convert('MST') test = cs_detection.ClearskyDetection(ground.df)test.trim_dates('10-01-2015', '10-04-2015')test.time_from_solar_noon('Clearsky GHI pvlib', 'tfn') test.scale_by_irrad('Clearsky GHI pvlib')test.df = test.df[test.df.index.minute % 5 == 0]pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 13, multiproc=True, by_day=True).astype(bool)train2 = cs_detection.ClearskyDetection(train.df) train2.trim_dates('10-01-2015', '10-17-2015') train2.df = train2.df.reindex(pd.date_range(start=train2.df.index[0], end=train2.df.index[-1], freq='5min')) train2.df['sky_status'] = train2.df['sky_status'].fillna(False)nsrdb_clear = train2.df['sky_status'] ml_clear = test.df['sky_status iter'] vis = visualize.Visualizer() vis.add_line_ser(test.df['GHI'], 'GHI') vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs') vis.add_circle_ser(test.df[ml_clear & ~nsrdb_clear]['GHI'], 'ML clear only') vis.add_circle_ser(test.df[~ml_clear & nsrdb_clear]['GHI'], 'NSRDB clear only') vis.add_circle_ser(test.df[ml_clear & nsrdb_clear]['GHI'], 'Both clear') vis.show()probas = clf.predict_proba(test.df[feature_cols].values) test.df['probas'] = 0 test.df['probas'] = probas[:, 1] visualize.plot_ts_slider_highligther(test.df, prob='probas')## 1 min freq ground dataground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz') ground.df.index = ground.df.index.tz_convert('MST') test = cs_detection.ClearskyDetection(ground.df)test.trim_dates('10-01-2015', '10-08-2015')test.time_from_solar_noon('Clearsky GHI pvlib', 'tfn') test.scale_by_irrad('Clearsky GHI pvlib')test.df = test.df[test.df.index.minute % 1 == 0]pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 61, multiproc=True, by_day=True).astype(bool)train2 = cs_detection.ClearskyDetection(train.df) train2.trim_dates('10-01-2015', '10-08-2015') train2.df = train2.df.reindex(pd.date_range(start=train2.df.index[0], end=train2.df.index[-1], freq='1min')) train2.df['sky_status'] = train2.df['sky_status'].fillna(False)nsrdb_clear = train2.df['sky_status'] ml_clear = test.df['sky_status iter'] vis = visualize.Visualizer() vis.add_line_ser(test.df['GHI'], 'GHI') vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs') vis.add_circle_ser(test.df[ml_clear & ~nsrdb_clear]['GHI'], 'ML clear only') vis.add_circle_ser(test.df[~ml_clear & nsrdb_clear]['GHI'], 'NSRDB clear only') vis.add_circle_ser(test.df[ml_clear & nsrdb_clear]['GHI'], 'Both clear') vis.show()probas = clf.predict_proba(test.df[feature_cols].values) test.df['probas'] = 0 test.df['probas'] = probas[:, 1] visualize.plot_ts_slider_highligther(test.df, prob='probas')# Save modelimport picklewith open('8_abq_direction_features_model.pkl', 'wb') as f: pickle.dump(clf, f)!ls *abq*

Conclusion

In general, the clear sky identification looks good. At lower frequencies (30 min, 15 min) we see good agreement with NSRDB labeled points. I suspect this could be further improved my doing a larger hyperparameter search, or even doing some feature extraction/reduction/additions.

In [ ]: